{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 程序功能：对前三步处理后的数据集KDDCUP99，使用支持向量机SVM实现分类并输出评价指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sklearn\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import metrics\n",
    "from sklearn import svm\n",
    "from time import *\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import OrdinalEncoder"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输出python和package的版本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "platform  version: 1.0.8\n",
      "python version: 3.6.12\n",
      "csv version: 1.0\n",
      "numpy version: 1.19.2\n",
      "pandas version: 1.1.5\n",
      "scikit-learn version: 0.23.2\n",
      "matplotlib version: 3.3.2\n",
      "IPython version: 7.16.1\n"
     ]
    }
   ],
   "source": [
    "import platform \n",
    "print(\"platform  version: {}\".format(platform .__version__))\n",
    "print(\"python version: {}\".format(platform.python_version()))\n",
    "import csv\n",
    "print(\"csv version: {}\".format(csv.__version__))\n",
    "import numpy as np\n",
    "print(\"numpy version: {}\".format(np.__version__))\n",
    "import pandas as pd\n",
    "print(\"pandas version: {}\".format(pd.__version__))\n",
    "import sklearn\n",
    "print(\"scikit-learn version: {}\".format(sklearn.__version__))\n",
    "import matplotlib\n",
    "print(\"matplotlib version: {}\".format(matplotlib.__version__))\n",
    "import IPython\n",
    "print(\"IPython version: {}\".format(IPython.__version__))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取、处理数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "b'Skipping line 4817100: expected 42 fields, saw 56\\n'\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据集大小： (4898429, 42)\n",
      "数据集特征大小： (4898429, 10)\n",
      "数据集标签大小： 4898429\n"
     ]
    }
   ],
   "source": [
    "fr = pd.read_csv(\"kddcup.data.txt\", encoding='utf-8',error_bad_lines=False)\n",
    "data = np.array(fr)\n",
    "print('数据集大小：',data.shape)\n",
    "\n",
    "data[:,-1] = LabelEncoder().fit_transform(data[:,-1])        # 标签的编码\n",
    "data[:,0:-1] = OrdinalEncoder().fit_transform(data[:,0:-1])  # 特征的分类编码\n",
    "data = StandardScaler().fit_transform(data)                  # 标准化：利用Sklearn库的StandardScaler对数据标准化\n",
    "# data = MinMaxScaler().fit_transform(data)                  # 归一化：利用Sklearn库的MinMaxScaler对数据归一化，返回[0,1]区间的数据\n",
    "\n",
    "# 划分特征和标签\n",
    "line_nums = len(data)\n",
    "# data_feature = np.zeros((line_nums, 41))   # 创建line_nums行 41列的矩阵\n",
    "data_feature = np.zeros((line_nums, 10))   # 创建line_nums行 10列的矩阵\n",
    "data_labels = []\n",
    "for i in range(line_nums):                 # 依次读取每行\n",
    "#     data_feature[i,:] = data[i][0:41]     # 选择前41个特征  划分数据集特征和标签\n",
    "    feature = [3,4,5,6,8,10,13,23,24,37]   # 选择第3,4,5,6,8,10,13,23,24,37这10个特征分类\n",
    "    for j in feature:\n",
    "        data_feature[i,feature.index(j)] = data[i][j]\n",
    "        \n",
    "    data_labels.append(data[i][-1])       # 标签\n",
    "    \n",
    "print('数据集特征大小：',data_feature.shape)\n",
    "print('数据集标签大小：',len(data_labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 划分训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集特征大小：(2939057, 10)，训练集标签大小：(2939057,)\n",
      "测试集特征大小：(1959372, 10)，测试集标签大小：(1959372,)\n"
     ]
    }
   ],
   "source": [
    "data_label = []\n",
    "for i in data_labels: \n",
    "    data_label.append(int(float(i)))\n",
    "data_label =  np.array(data_label, dtype = int)       # list转换数组\n",
    "train_feature, test_feature, train_label, test_label = train_test_split(data_feature, data_label,test_size=0.4,random_state=4)# 测试集40%\n",
    "print('训练集特征大小：{}，训练集标签大小：{}'.format(train_feature.shape, train_label.shape))\n",
    "print('测试集特征大小：{}，测试集标签大小：{}'.format(test_feature.shape, test_label.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型训练、预测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方法一：决策树DT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start training DT：DecisionTreeClassifier(max_depth=20)\n",
      "Training done！\n",
      "Start prediction DT：\n",
      "Prediction done！\n",
      "预测结果： [0 0 0 ... 0 0 0]\n",
      "实际结果： [0 0 0 ... 0 0 0]\n",
      "正确预测的数量： 1949172\n",
      "训练预测耗时： 24.315146446228027 s\n"
     ]
    }
   ],
   "source": [
    "begin_time = time()                     # 训练预测开始时间\n",
    "if __name__ == '__main__':\n",
    "    print('Start training DT：',end='')\n",
    "    dt = sklearn.tree.DecisionTreeClassifier(criterion='gini',splitter='best', max_depth=20, min_samples_split=2, min_samples_leaf =1)\n",
    "    dt.fit(train_feature, train_label)\n",
    "    print(dt)\n",
    "    print('Training done！')\n",
    "\n",
    "    print('Start prediction DT：')\n",
    "    test_predict = dt.predict(test_feature)\n",
    "    print('Prediction done！')\n",
    "\n",
    "    print('预测结果：',test_predict)\n",
    "    print('实际结果：',test_label)\n",
    "    print('正确预测的数量：',sum(test_predict==test_label)) \n",
    "end_time = time()                        # 训练预测结束时间\n",
    "total_time = end_time - begin_time\n",
    "print('训练预测耗时：',total_time,'s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 方法二：支持向量机SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # begin_time = time()                         # 训练预测开始时间\n",
    "# if __name__ == '__main__':\n",
    "#     print('Start training SVM：',end='')\n",
    "#     svm = sklearn.svm.SVC(kernel='rbf', C=1.0, gamma=1.5, tol=1e-2)  # 选择高斯核函数rbf，正则化系数为1，核函数系数0.5，SMO迭代精度1e-2\n",
    "#     svm.fit(train_feature, train_label)      # 开始训练SVM\n",
    "#     print(svm)\n",
    "#     print('Training done！')\n",
    "    \n",
    "#     print('Start prediction SVM：')\n",
    "#     test_predict = svm.predict(test_feature) # 对测试集进行类别预测\n",
    "#     print('Prediction done！')\n",
    "    \n",
    "#     print('预测结果：',test_predict)\n",
    "#     print('实际结果：',test_label)\n",
    "#     print('正确预测的数量：',sum(test_predict==test_label)) \n",
    "# # end_time = time                             # 训练预测结束时间\n",
    "# # total_time = end_time - begin_time\n",
    "# # print('训练预测耗时：',total_time,'s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输出分类报告"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "准确率: 0.99479425040268\n",
      "宏平均精确率: 0.9757850958567713\n",
      "微平均精确率: 0.99479425040268\n",
      "宏平均召回率: 0.9802354835751075\n",
      "平均F1-score: 0.9947985471535458\n",
      "混淆矩阵输出:\n",
      "[[    883       0       0       8       0]\n",
      " [      0    4705       3     315       0]\n",
      " [      0     434  425276    4120       0]\n",
      " [      4      33    5265 1517515       8]\n",
      " [      1       0       1       8     793]]\n",
      "分类报告:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          -3       0.99      0.99      0.99       891\n",
      "          -2       0.91      0.94      0.92      5023\n",
      "          -1       0.99      0.99      0.99    429830\n",
      "           0       1.00      1.00      1.00   1522825\n",
      "           1       0.99      0.99      0.99       803\n",
      "\n",
      "    accuracy                           0.99   1959372\n",
      "   macro avg       0.98      0.98      0.98   1959372\n",
      "weighted avg       0.99      0.99      0.99   1959372\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('准确率:', metrics.accuracy_score(test_label, test_predict))                         # 预测准确率输出\n",
    "print('宏平均精确率:',metrics.precision_score(test_label,test_predict,average='macro'))    # 预测宏平均精确率输出\n",
    "print('微平均精确率:', metrics.precision_score(test_label, test_predict, average='micro')) # 预测微平均精确率输出\n",
    "print('宏平均召回率:',metrics.recall_score(test_label,test_predict,average='macro'))       # 预测宏平均召回率输出\n",
    "print('平均F1-score:',metrics.f1_score(test_label,test_predict,average='weighted'))        # 预测平均f1-score输出\n",
    "print('混淆矩阵输出:')\n",
    "print(metrics.confusion_matrix(test_label,test_predict))                                   # 混淆矩阵输出\n",
    "# 从精确率:precision、召回率:recall、 调和平均f1值:f1-score和支持度:support四个维度进行衡量\n",
    "print('分类报告:')\n",
    "print(metrics.classification_report(test_label, test_predict))                             # 分类报告输出"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
